In [1]:
# load packages
import numpy as np
import pandas as pd; pd.set_option('display.max_rows', 1000); pd.set_option('display.max_columns', 500)
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
import pickle

import geopandas as gpd # http://geopandas.org/gallery/plotting_basemap_background.html#sphx-glr-gallery-plotting-basemap-background-py
import contextily as ctx

import shapely.wkt # https://stackoverflow.com/questions/51855917/shapely-polygon-from-string
from shapely.geometry import Point, LineString, MultiLineString, Polygon
from shapely.ops import cascaded_union
In [3]:
df_col = pd.read_csv('data/NYPD_Motor_Vehicle_Collisions_-_Crashes.zip')

# make column names lower-case
df_col.rename(columns={c:c.lower() for c in df_col.columns}, inplace=True)

# convert data type
df_col['location'] = df_col['location'].apply(lambda x: shapely.wkt.loads(x) if not pd.isnull(x) else np.nan)

# correct wrong data
df_col.loc[
    ((40.489 > df_col['latitude']) | (df_col['latitude'] > 40.924) |  # NYC boundary
     (-74.272 > df_col['longitude']) | (df_col['longitude'] > -73.689)) # NYC boundary
    , ['latitude','longitude','location']    
] = np.nan
C:\Users\caffr\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3049: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
In [63]:
# functions
def Plot_Injuries_vs_Year(df_col, col='road_type', hue='is_bike', injured_killed='injured'):
    pp = df_col.groupby(list(set([c for c in ['year',col,hue] if pd.notnull(c)]))).sum().reset_index(drop=False)
    pp = pp.loc[(pp['year'] >= 2013) & (pp['year'] <= 2018)] # only year = [2013,2014,2015,2016,2017,2018]

    if pd.notnull(col) & pd.notnull(hue):
        sns.lmplot(x='year', y="number of persons "+injured_killed, col=col, hue=hue, data=pp, col_wrap=10, aspect=0.5) # https://seaborn.pydata.org/tutorial/regression.html
        sns.lmplot(x='year', y="number of motorist "+injured_killed, col=col, hue=hue, data=pp, col_wrap=10, aspect=0.5) # https://seaborn.pydata.org/tutorial/regression.html
        sns.lmplot(x='year', y="number of pedestrians "+injured_killed, col=col, hue=hue, data=pp, col_wrap=10, aspect=0.5) # https://seaborn.pydata.org/tutorial/regression.html
        sns.lmplot(x='year', y="number of cyclist "+injured_killed, col=col, hue=hue, data=pp, col_wrap=10, aspect=0.5) # https://seaborn.pydata.org/tutorial/regression.html

    else:
        pp = pd.melt(pp, 
             id_vars=['year'], 
             value_vars=['number of persons '+injured_killed,'number of motorist '+injured_killed,'number of pedestrians '+injured_killed,'number of cyclist '+injured_killed],
             var_name='type', 
             value_name='number of '+injured_killed)
        sns.lmplot(x='year', y="number of "+injured_killed, hue='type', data=pp, aspect=1.2) # https://seaborn.pydata.org/tutorial/regression.html

    plt.show()

def Plot_Geoms_On_NYCMap(geoms, ON_MAP=True):
    #ref) https://towardsdatascience.com/geopandas-101-plot-any-data-with-a-latitude-and-longitude-on-a-map-98e01944b972
    # df = gpd.GeoDataFrame(geometry=df_col['location'].dropna().loc[1:1000], crs={'init':'epsg:4326'}) 
    df = gpd.GeoDataFrame(geometry=geoms, crs={'init':'epsg:4326'}) 
    df = df.to_crs(epsg=3857) # convert data to the same CRS to combine our polygons and background tiles in the same map

    # NYC borough boundaries
    #ref) http://geopandas.org/gallery/plotting_basemap_background.html#sphx-glr-gallery-plotting-basemap-background-py
    df_nyc = gpd.read_file(gpd.datasets.get_path('nybb')) # NYC borough boundary data that is available in geopandas datasets
    df_nyc = df_nyc.to_crs(epsg=3857) # convert data to the same CRS to combine our polygons and background tiles in the same map

    #--- plot --------------------
    fig, ax = plt.subplots(figsize=(15,15))
    df_nyc.plot(ax=ax, color='none', edgecolor='b', alpha=1.0, linewidth=1.5) # NYC borough boundaries
    df.plot(ax=ax, color='r', edgecolor='r', alpha=0.2, markersize=1)
    if ON_MAP:
#         ctx.add_basemap(ax)
#         ctx.add_basemap(ax, zoom=12)
        ctx.add_basemap(ax, url=ctx.providers.Stamen.TonerLite)
    ax.set_axis_off()
    
# Plot_Geoms_On_NYCMap(df_col.loc[df_col['on street name'].isnull(),'location'].dropna())    

#-------------------------------------------------------------------------------
# from nltk.metrics.distance import edit_distance
# # http://maf.directory/zp4/abbrev.html
# word = 'lane'
# ww = pd.DataFrame({'word':aa, 'count':0}).groupby('word').count().reset_index().sort_values(by='count', ascending=False)
# ww['ii'] = ww['word'].apply(lambda w: word[0] != w[0])
# ww['ss'] = ww['word'].apply(lambda w: edit_distance(word, w, transpositions = True))
# ww.sort_values(by=['ii','ss'])

def find_road_type(streetname):
    if len(set(streetname.lower().split()).intersection(['expressway','expressay','expresswaay','expresway','exressway','express','expway','expwy','expre','exway','expy','exwy','exwpy','exp','exp.'])) > 0:
        return 'expressway'
    elif len(set(streetname.lower().split()).intersection(['parkway','parrkway','parlway','parkwway','parkwayy','parkwary','prkwy','pkwy'])) > 0:
        return 'parkway'
#     elif len(set(streetname.lower().split()).intersection(['highway','hwy'])) > 0:
#         return 'highway'
    elif len(set(streetname.lower().split()).intersection(['drive','dr'])) > 0:
        return 'drive'
#     elif ((len(set(streetname.lower().split()).intersection(['drive','dr'])) > 0) &
#           (len(set(streetname.lower().split()).intersection(['fdr'])) == 0)):
#         return 'drive'
#     elif len(set(streetname.lower().split()).intersection(['fdr'])) > 0:
#         return 'fdr drive'
#     elif len(set(streetname.lower().split()).intersection(['broadway'])) > 0:
#         return 'broadway'
#     elif len(set(streetname.lower().split()).intersection(['turnpike','tpke'])) > 0:
#         return 'turnpike'
#     elif len(set(streetname.lower().split()).intersection(['thruway','thruwy','thurway','throughway'])) > 0:
#         return 'thruway'
#     elif len(set(streetname.lower().split()).intersection(['concourse','concours'])) > 0:
#         return 'concourse'
    elif len(set(streetname.lower().split()).intersection(['bridge','bridge/','bridge,','bridgte','brdg','brg'])) > 0:
        return 'bridge'
    
    elif len(set(streetname.lower().split()).intersection(['street','st','st.'])) > 0:
        return 'street'
#     elif len(set(streetname.lower().split()).intersection(['boulevard','blvd'])) > 0:
#         return 'boulevard'
    elif len(set(streetname.lower().split()).intersection(['avenue','ave'])) > 0:
        return 'avenue'
    elif len(set(streetname.lower().split()).intersection(['road','rd'])) > 0:
        return 'road'
    else:
        return 'others'
    

def count_by_year_borough(df, col1 = None, col2 = None, injured_killed='injured'):
    #df1 = df.loc[(df['year'] >=2012) & (df['year'] <= 2015)]
    #df2 = df.loc[(df['year'] >=2016) & (df['year'] <= 2019)]
    
    if pd.notnull(col1) & pd.notnull(col2):
        #tab1 = df1.groupby([col1, col2])['number of persons ' + injured_killed, 'number of pedestrians ' + injured_killed,'number of cyclist ' + injured_killed, 'number of motorist ' + injured_killed].sum()
        #tab2 = df2.groupby([col1, col2])['number of persons ' + injured_killed, 'number of pedestrians ' + injured_killed,'number of cyclist ' + injured_killed, 'number of motorist ' + injured_killed].sum()
        #return(tab1, tab2)
        return(df.groupby([col1, col2])['number of persons ' + injured_killed, 'number of pedestrians ' + injured_killed,'number of cyclist ' + injured_killed, 'number of motorist ' + injured_killed].sum())
    else:
        #tab1 = df1.groupby(['year', 'borough'])['number of pedestrians ' + injured_killed,'number of cyclist ' + injured_killed, 'number of motorist ' + injured_killed].sum()
        #tab2 = df2.groupby(['year', 'borough'])['number of pedestrians ' + injured_killed,'number of cyclist ' + injured_killed, 'number of motorist ' + injured_killed].sum()
        #return(tab1, tab2)
        return(df.groupby(['year', 'borough'])['number of pedestrians ' + injured_killed,'number of cyclist ' + injured_killed, 'number of motorist ' + injured_killed].sum())
    

def pivot_table(df, injured_killed = 'injured',index= 'borough'):
    tab = pd.pivot_table(test, values=['number of persons ' + injured_killed,'number of pedestrians ' + injured_killed, 'number of cyclist ' + injured_killed,'number of motorist ' + injured_killed], index=[index], aggfunc=np.sum)
    return(tab)
In [5]:
# impute data : borough, zip code, on street name
def Load_New_Data_From_OpenStreetMap():
    zz = []
    for n in range(0,35):
        print(n)
        zz.append(pd.read_pickle('data/openstreetmap/geopy_(%s).pickle'%n))
    zz = pd.concat(zz, axis=0).reset_index(drop=True)
    print(zz.shape)
    zz = zz.drop_duplicates(subset='unique id')
    print(zz.shape)
    print(zz.info())
    zz.head()
    
    aa = zz['geopy'].apply(lambda x: x['address']['county'].upper()).map({
        'NEW YORK COUNTY':'MANHATTAN', 
        'BRONX COUNTY':'BRONX', 
        'QUEENS COUNTY':'QUEENS', 
        'KINGS COUNTY':'BROOKLYN',
        'RICHMOND COUNTY':'STATEN ISLAND'})
    bb = zz['geopy'].apply(lambda x: x['address']['postcode'][0:5].replace('1122O','11220')
                           if 'postcode' in x['address'].keys() else np.nan).apply(lambda y: int(y) if (y not in ['(718)','NY 11','NY 10']) & (not pd.isnull(y)) else np.nan)
    cc = zz['geopy'].apply(lambda x: x['address']['road'].upper()
                           if 'road' in x['address'].keys() else np.nan)

    zz.loc[zz['borough'].isnull(),'borough'] = aa[zz['borough'].isnull()]
    zz.loc[zz['zip code'].isnull(),'zip code'] = bb[zz['zip code'].isnull()]
    zz.loc[zz['on street name'].isnull(),'on street name'] = cc[zz['on street name'].isnull()]

    print(zz.info())
    zz.head()   
    return zz

zz = Load_New_Data_From_OpenStreetMap()
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
(482606, 7)
(482606, 7)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 482606 entries, 0 to 482605
Data columns (total 7 columns):
unique id         482606 non-null int64
latitude          482606 non-null float64
longitude         482606 non-null float64
borough           177468 non-null object
zip code          177298 non-null object
on street name    231399 non-null object
geopy             482606 non-null object
dtypes: float64(2), int64(1), object(4)
memory usage: 29.5+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 482606 entries, 0 to 482605
Data columns (total 7 columns):
unique id         482606 non-null int64
latitude          482606 non-null float64
longitude         482606 non-null float64
borough           482288 non-null object
zip code          481442 non-null object
on street name    479847 non-null object
geopy             482606 non-null object
dtypes: float64(2), int64(1), object(4)
memory usage: 29.5+ MB
None
In [6]:
df_col.set_index('unique id', inplace=True)
df_col.loc[zz['unique id'],['borough','zip code','on street name']] = zz[['unique id','borough','zip code','on street name']].set_index('unique id')
df_col.reset_index(inplace=True)

print(df_col.shape)
print(df_col.info())
df_col.head()
(1558237, 29)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1558237 entries, 0 to 1558236
Data columns (total 29 columns):
unique id                        1558237 non-null int64
date                             1558237 non-null object
time                             1558237 non-null object
borough                          1392954 non-null object
zip code                         1392091 non-null object
latitude                         1364727 non-null float64
longitude                        1364727 non-null float64
location                         1364727 non-null object
on street name                   1503396 non-null object
cross street name                1044964 non-null object
off street name                  211127 non-null object
number of persons injured        1558220 non-null float64
number of persons killed         1558206 non-null float64
number of pedestrians injured    1558237 non-null int64
number of pedestrians killed     1558237 non-null int64
number of cyclist injured        1558237 non-null int64
number of cyclist killed         1558237 non-null int64
number of motorist injured       1558237 non-null int64
number of motorist killed        1558237 non-null int64
contributing factor vehicle 1    1554172 non-null object
contributing factor vehicle 2    1349807 non-null object
contributing factor vehicle 3    100674 non-null object
contributing factor vehicle 4    20907 non-null object
contributing factor vehicle 5    5343 non-null object
vehicle type code 1              1553172 non-null object
vehicle type code 2              1303443 non-null object
vehicle type code 3              129925 non-null object
vehicle type code 4              48725 non-null object
vehicle type code 5              10452 non-null object
dtypes: float64(4), int64(7), object(18)
memory usage: 344.8+ MB
None
Out[6]:
unique id date time borough zip code latitude longitude location on street name cross street name off street name number of persons injured number of persons killed number of pedestrians injured number of pedestrians killed number of cyclist injured number of cyclist killed number of motorist injured number of motorist killed contributing factor vehicle 1 contributing factor vehicle 2 contributing factor vehicle 3 contributing factor vehicle 4 contributing factor vehicle 5 vehicle type code 1 vehicle type code 2 vehicle type code 3 vehicle type code 4 vehicle type code 5
0 4194003 08/24/2019 11:53 NaN NaN NaN NaN NaN CROSS BRONX EXPY RAMP NaN NaN 3.0 0.0 0 0 0 0 3 0 Other Vehicular Aggressive Driving/Road Rage NaN NaN NaN Sedan Sedan NaN NaN NaN
1 4194562 08/24/2019 13:22 NaN NaN NaN NaN NaN VERRAZANO BRIDGE LOWER NaN NaN 0.0 0.0 0 0 0 0 0 0 Reaction to Uninvolved Vehicle Unspecified NaN NaN NaN Station Wagon/Sport Utility Vehicle Sedan NaN NaN NaN
2 4194211 08/24/2019 23:00 NaN NaN NaN NaN NaN MEEKER AVENUE NaN NaN 0.0 0.0 0 0 0 0 0 0 Passing Too Closely Unspecified NaN NaN NaN Sedan Motorcycle NaN NaN NaN
3 4193896 08/24/2019 23:50 NaN NaN NaN NaN NaN MERIDIAN RD NEW YORK AVE NaN 0.0 0.0 0 0 0 0 0 0 View Obstructed/Limited NaN NaN NaN NaN Station Wagon/Sport Utility Vehicle NaN NaN NaN NaN
4 4193507 08/23/2019 0:12 MANHATTAN 10031 40.825134 -73.95141 POINT (-73.95141 40.825134) BROADWAY NaN NaN 0.0 0.0 0 0 0 0 0 0 Unsafe Lane Changing Unspecified NaN NaN NaN Pick-up Truck Sedan NaN NaN NaN
In [11]:
# create new columns

df_col['date_time'] = pd.to_datetime(df_col['date'] + ' ' + df_col['time'], format= '%m/%d/%Y %H:%M')
df_col['year'] = df_col['date_time'].dt.year
df_col['month'] = df_col['date_time'].dt.month
df_col['dayofweek'] = df_col['date_time'].dt.dayofweek
df_col['hour'] = df_col['date_time'].dt.hour

df_col['is_borough_null'] = df_col['borough'].isnull()
df_col['borough'].fillna('N/A', inplace = True)

df_col['is_location_null'] = df_col['location'].isnull()

df_col['road_type'] = df_col['on street name'].apply(lambda x: find_road_type(x) if not pd.isnull(x) else 'N/A')
df_col['is_expressway'] = df_col['road_type'].apply(lambda x: x in ['bridge', 'drive', 'expressway', 'parkway'])

Assumption 1: in order for an accident to be categorized as a KSI, pedesrtains or cyclist must collide with a car.

In [26]:
# KSI

# 'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion'
#list(df_col['contributing factor vehicle 1'])

# 'number of motorist killed', 'number of motorist injured'
print("number of KSI killed :", df_col.loc[df_col['contributing factor vehicle 1'] == 'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion']['number of persons killed'].sum()
)
print("number of KSI injured :", df_col.loc[df_col['contributing factor vehicle 1'] == 'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion']['number of persons injured'].sum()
)
# df_col['ksi'] = df_col.loc[df_col['contributing factor vehicle 1'] == 'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion']
number of KSI killed : 46.0
number of KSI injured : 3892.0
In [35]:
df_ksi = df_col.loc[df_col['contributing factor vehicle 1'] == 'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion']
#df_ksi.head(3)
INJURED
KILLED
Out[35]:
<seaborn.axisgrid.FacetGrid at 0x17b3e68add8>
In [108]:
Plot_Injuries_vs_Year(df_ksi, col=None, hue=None, injured_killed='injured')
In [109]:
Plot_Injuries_vs_Year(df_ksi, col=None, hue=None, injured_killed='killed')
In [204]:
a = count_by_year_borough(df_ksi, col1='year',col2= 'borough',injured_killed= 'injured')
a.sort_values(by = 'number of persons injured', ascending = False)
Out[204]:
number of persons injured number of pedestrians injured number of cyclist injured number of motorist injured
year borough
2017 BROOKLYN 368.0 158 190 21
2016 BROOKLYN 357.0 139 192 26
2018 BROOKLYN 333.0 124 177 32
2016 MANHATTAN 316.0 138 166 12
2017 MANHATTAN 309.0 137 151 19
2018 MANHATTAN 265.0 114 136 15
BRONX 217.0 111 79 26
2017 BRONX 212.0 115 77 20
2019 MANHATTAN 180.0 64 101 15
2016 BRONX 179.0 97 68 14
2019 BROOKLYN 173.0 65 92 16
2017 QUEENS 155.0 53 96 7
2018 QUEENS 154.0 63 79 12
2016 QUEENS 151.0 56 88 7
2019 BRONX 109.0 59 33 17
QUEENS 85.0 26 54 5
2016 N/A 65.0 22 36 7
2017 N/A 55.0 22 31 2
2018 N/A 41.0 14 27 0
2019 N/A 37.0 10 26 1
2018 STATEN ISLAND 26.0 14 10 2
2017 STATEN ISLAND 25.0 13 10 2
2016 STATEN ISLAND 22.0 13 8 1
2019 STATEN ISLAND 16.0 10 5 1
2014 BROOKLYN 5.0 0 0 5
2015 MANHATTAN 5.0 0 0 5
2013 BRONX 5.0 0 0 5
2012 BROOKLYN 4.0 0 0 4
2015 N/A 4.0 0 0 4
2014 MANHATTAN 3.0 0 0 3
QUEENS 2.0 0 0 2
BRONX 2.0 0 0 2
2013 MANHATTAN 2.0 0 0 2
N/A 1.0 0 0 1
2012 MANHATTAN 1.0 0 0 1
QUEENS 1.0 0 0 1
2015 QUEENS 1.0 0 0 1
2013 BROOKLYN 1.0 0 1 0
2012 BRONX 1.0 0 0 1
2015 BROOKLYN 1.0 0 0 1
2014 N/A 1.0 0 0 1
STATEN ISLAND 1.0 1 0 0
2015 BRONX 1.0 0 0 1
2013 QUEENS 0.0 0 0 0
2015 STATEN ISLAND 0.0 0 0 0
2012 STATEN ISLAND 0.0 0 0 0
2013 STATEN ISLAND 0.0 0 0 0
In [126]:
count_by_year_borough(df_ksi, col1='year',col2= 'road_type',injured_killed= 'injured')
Out[126]:
number of persons injured number of pedestrians injured number of cyclist injured number of motorist injured
year road_type
2012 avenue 4.0 0 0 4
others 1.0 0 0 1
road 1.0 0 0 1
street 1.0 0 0 1
2013 N/A 1.0 0 0 1
avenue 4.0 0 0 4
others 1.0 0 0 1
parkway 0.0 0 0 0
road 0.0 0 0 0
street 3.0 0 1 2
2014 N/A 0.0 0 0 0
avenue 8.0 0 0 8
drive 0.0 0 0 0
others 2.0 0 0 2
parkway 0.0 0 0 0
road 0.0 0 0 0
street 4.0 1 0 3
2015 N/A 3.0 0 0 3
avenue 2.0 0 0 2
expressway 1.0 0 0 1
others 0.0 0 0 0
parkway 0.0 0 0 0
road 0.0 0 0 0
street 6.0 0 0 6
2016 N/A 30.0 12 16 2
avenue 464.0 208 227 29
bridge 11.0 4 7 0
drive 21.0 3 17 1
expressway 3.0 0 3 0
others 118.0 46 62 10
parkway 22.0 9 9 4
road 41.0 19 22 0
street 380.0 164 195 21
2017 N/A 34.0 11 22 1
avenue 480.0 229 226 26
bridge 9.0 3 6 0
drive 8.0 5 3 0
expressway 2.0 1 1 0
others 120.0 54 53 13
parkway 17.0 5 10 2
road 36.0 14 21 2
street 418.0 176 213 27
2018 N/A 20.0 7 12 1
avenue 452.0 189 220 43
bridge 2.0 0 2 0
drive 19.0 6 8 5
expressway 4.0 3 1 0
others 112.0 44 58 10
parkway 21.0 9 11 1
road 58.0 36 17 4
street 348.0 146 179 23
2019 N/A 20.0 4 16 0
avenue 267.0 102 139 26
bridge 2.0 0 2 0
drive 12.0 6 6 0
expressway 2.0 1 1 0
others 63.0 25 31 7
parkway 8.0 2 5 1
road 23.0 11 6 6
street 203.0 83 105 15
In [64]:
Plot_Injuries_vs_Year(df_ksi, col='road_type', hue='road_type', injured_killed='injured')
In [125]:
count_by_year_borough(df_ksi, injured_killed= 'injured')
Out[125]:
number of pedestrians injured number of cyclist injured number of motorist injured
year borough
2012 BRONX 0 0 1
BROOKLYN 0 0 4
MANHATTAN 0 0 1
QUEENS 0 0 1
STATEN ISLAND 0 0 0
2013 BRONX 0 0 5
BROOKLYN 0 1 0
MANHATTAN 0 0 2
N/A 0 0 1
QUEENS 0 0 0
STATEN ISLAND 0 0 0
2014 BRONX 0 0 2
BROOKLYN 0 0 5
MANHATTAN 0 0 3
N/A 0 0 1
QUEENS 0 0 2
STATEN ISLAND 1 0 0
2015 BRONX 0 0 1
BROOKLYN 0 0 1
MANHATTAN 0 0 5
N/A 0 0 4
QUEENS 0 0 1
STATEN ISLAND 0 0 0
2016 BRONX 97 68 14
BROOKLYN 139 192 26
MANHATTAN 138 166 12
N/A 22 36 7
QUEENS 56 88 7
STATEN ISLAND 13 8 1
2017 BRONX 115 77 20
BROOKLYN 158 190 21
MANHATTAN 137 151 19
N/A 22 31 2
QUEENS 53 96 7
STATEN ISLAND 13 10 2
2018 BRONX 111 79 26
BROOKLYN 124 177 32
MANHATTAN 114 136 15
N/A 14 27 0
QUEENS 63 79 12
STATEN ISLAND 14 10 2
2019 BRONX 59 33 17
BROOKLYN 65 92 16
MANHATTAN 64 101 15
N/A 10 26 1
QUEENS 26 54 5
STATEN ISLAND 10 5 1
In [67]:
Plot_Injuries_vs_Year(df_ksi, col='borough', hue='borough', injured_killed='injured')
In [127]:
count_by_year_borough(df_ksi, col1 = 'year', col2= 'road_type', injured_killed= 'killed')
Out[127]:
number of persons killed number of pedestrians killed number of cyclist killed number of motorist killed
year road_type
2012 avenue 0.0 0 0 0
others 0.0 0 0 0
road 0.0 0 0 0
street 0.0 0 0 0
2013 N/A 0.0 0 0 0
avenue 0.0 0 0 0
others 0.0 0 0 0
parkway 0.0 0 0 0
road 0.0 0 0 0
street 0.0 0 0 0
2014 N/A 0.0 0 0 0
avenue 0.0 0 0 0
drive 0.0 0 0 0
others 0.0 0 0 0
parkway 0.0 0 0 0
road 0.0 0 0 0
street 0.0 0 0 0
2015 N/A 0.0 0 0 0
avenue 0.0 0 0 0
expressway 0.0 0 0 0
others 0.0 0 0 0
parkway 0.0 0 0 0
road 0.0 0 0 0
street 0.0 0 0 0
2016 N/A 0.0 0 0 0
avenue 4.0 4 0 0
bridge 0.0 0 0 0
drive 0.0 0 0 0
expressway 3.0 3 0 0
others 9.0 5 3 1
parkway 2.0 1 1 0
road 0.0 0 0 0
street 1.0 1 0 0
2017 N/A 0.0 0 0 0
avenue 5.0 0 5 0
bridge 0.0 0 0 0
drive 0.0 0 0 0
expressway 1.0 1 0 0
others 1.0 0 1 0
parkway 1.0 1 0 0
road 2.0 2 0 0
street 4.0 3 1 0
2018 N/A 0.0 0 0 0
avenue 5.0 5 0 0
bridge 0.0 0 0 0
drive 0.0 0 0 0
expressway 1.0 1 0 0
others 1.0 1 0 0
parkway 0.0 0 0 0
road 0.0 0 0 0
street 1.0 1 0 0
2019 N/A 0.0 0 0 0
avenue 4.0 3 1 0
bridge 0.0 0 0 0
drive 0.0 0 0 0
expressway 0.0 0 0 0
others 0.0 0 0 0
parkway 1.0 1 0 0
road 0.0 0 0 0
street 0.0 0 0 0
In [65]:
Plot_Injuries_vs_Year(df_ksi, col='road_type', hue='road_type', injured_killed='killed')
In [128]:
count_by_year_borough(df_ksi, injured_killed= 'killed')
Out[128]:
number of pedestrians killed number of cyclist killed number of motorist killed
year borough
2012 BRONX 0 0 0
BROOKLYN 0 0 0
MANHATTAN 0 0 0
QUEENS 0 0 0
STATEN ISLAND 0 0 0
2013 BRONX 0 0 0
BROOKLYN 0 0 0
MANHATTAN 0 0 0
N/A 0 0 0
QUEENS 0 0 0
STATEN ISLAND 0 0 0
2014 BRONX 0 0 0
BROOKLYN 0 0 0
MANHATTAN 0 0 0
N/A 0 0 0
QUEENS 0 0 0
STATEN ISLAND 0 0 0
2015 BRONX 0 0 0
BROOKLYN 0 0 0
MANHATTAN 0 0 0
N/A 0 0 0
QUEENS 0 0 0
STATEN ISLAND 0 0 0
2016 BRONX 6 0 0
BROOKLYN 0 0 0
MANHATTAN 2 0 1
N/A 1 0 0
QUEENS 4 4 0
STATEN ISLAND 1 0 0
2017 BRONX 3 0 0
BROOKLYN 1 3 0
MANHATTAN 1 3 0
N/A 0 0 0
QUEENS 1 1 0
STATEN ISLAND 1 0 0
2018 BRONX 3 0 0
BROOKLYN 1 0 0
MANHATTAN 2 0 0
N/A 0 0 0
QUEENS 2 0 0
STATEN ISLAND 0 0 0
2019 BRONX 0 0 0
BROOKLYN 0 1 0
MANHATTAN 1 0 0
N/A 0 0 0
QUEENS 2 0 0
STATEN ISLAND 1 0 0
In [68]:
Plot_Injuries_vs_Year(df_ksi, col='borough', hue='borough', injured_killed='killed')
In [179]:
a = pivot_table(df_ksi, injured_killed= "killed", index='borough')
a.sort_values(by = 'number of persons killed', ascending = False)
Out[179]:
number of cyclist killed number of motorist killed number of pedestrians killed number of persons killed
borough
BROOKLYN 8 80 43 131.0
QUEENS 1 91 26 118.0
BRONX 1 53 21 75.0
MANHATTAN 2 24 33 59.0
N/A 0 42 17 59.0
STATEN ISLAND 1 24 7 32.0
In [180]:
a = pivot_table(df_ksi, injured_killed= "injured", index='borough')
a.sort_values(by = 'number of persons injured', ascending = False)
Out[180]:
number of cyclist injured number of motorist injured number of pedestrians injured number of persons injured
borough
BROOKLYN 3 217 40 260.0
QUEENS 0 186 24 210.0
BRONX 1 128 25 154.0
MANHATTAN 2 68 56 126.0
N/A 1 104 11 116.0
STATEN ISLAND 1 54 4 59.0
In [181]:
a = pivot_table(df_ksi, injured_killed= "injured", index='road_type')
a.sort_values(by = 'number of persons injured', ascending = False)
Out[181]:
number of cyclist injured number of motorist injured number of pedestrians injured number of persons injured
road_type
avenue 4 258 76 338.0
others 2 126 28 156.0
street 2 91 41 134.0
parkway 0 109 2 111.0
expressway 0 85 4 89.0
N/A 0 39 2 41.0
road 0 29 7 36.0
drive 0 11 0 11.0
bridge 0 9 0 9.0
In [183]:
a = pivot_table(df_ksi, injured_killed= "killed", index='road_type')
a.sort_values(by = 'number of persons killed', ascending = False)
Out[183]:
number of cyclist killed number of motorist killed number of pedestrians killed number of persons killed
road_type
avenue 8 107 59 174.0
others 2 53 25 80.0
street 3 34 40 77.0
parkway 0 50 10 60.0
expressway 0 32 5 37.0
N/A 0 16 4 20.0
road 0 10 4 14.0
bridge 0 6 0 6.0
drive 0 6 0 6.0
In [192]:
Plot_Injuries_vs_Year(df_ksi.loc[df_ksi['road_type'] == 'street'] , col='borough', hue='borough', injured_killed='injured')
In [193]:
Plot_Injuries_vs_Year(df_ksi.loc[df_ksi['road_type'] == 'others'] , col='borough', hue='borough', injured_killed='injured')
In [194]:
Plot_Injuries_vs_Year(df_ksi.loc[df_ksi['road_type'] == 'avenue'] , col='borough', hue='borough', injured_killed='injured')

Assumption 2: the reason for the spike in the 2016 compare to the previous years is because NYPD start implementing the digital reproting system. That is, it's faster and conveninent for the police and citizens to reprot, leading to dramatic increase in the number of reprots

general data

In [111]:
Plot_Injuries_vs_Year(df_col, col=None, hue=None, injured_killed='injured')
In [112]:
Plot_Injuries_vs_Year(df_col, col=None, hue=None, injured_killed='killed')
In [113]:
Plot_Injuries_vs_Year(df_col, col='road_type', hue='road_type', injured_killed='injured')
In [172]:
pivot_table(df_col, injured_killed= "killed", index='borough')
Out[172]:
number of cyclist killed number of motorist killed number of pedestrians killed number of persons killed
borough
BRONX 1 53 21 75.0
BROOKLYN 8 80 43 131.0
MANHATTAN 2 24 33 59.0
N/A 0 42 17 59.0
QUEENS 1 91 26 118.0
STATEN ISLAND 1 24 7 32.0
In [163]:
pivot_table(df= df_col, injured_killed='injured', index='borough')
Out[163]:
number of cyclist injured number of motorist injured number of pedestrians injured number of persons injured
borough
BRONX 1 128 25 154.0
BROOKLYN 3 217 40 260.0
MANHATTAN 2 68 56 126.0
N/A 1 104 11 116.0
QUEENS 0 186 24 210.0
STATEN ISLAND 1 54 4 59.0
In [176]:
pivot_table(df= df_col, injured_killed='killed', index='year')
Out[176]:
number of cyclist killed number of motorist killed number of pedestrians killed number of persons killed
year
2012 0 29 12 41.0
2013 0 49 33 82.0
2014 0 46 15 61.0
2015 3 46 19 68.0
2016 4 34 21 59.0
2017 3 48 21 72.0
2018 0 39 16 55.0
2019 3 23 10 36.0
In [175]:
pivot_table(df= df_col, injured_killed='injured', index='year')
Out[175]:
number of cyclist injured number of motorist injured number of pedestrians injured number of persons injured
year
2012 0 57 11 68.0
2013 0 107 29 136.0
2014 0 96 26 122.0
2015 2 133 14 149.0
2016 1 98 14 113.0
2017 5 129 47 181.0
2018 0 79 13 92.0
2019 0 58 6 64.0
In [174]:
a = pivot_table(df= df_col, injured_killed='injured', index='road_type')
a.sort_values(by= 'number of persons injured', ascending = False)
Out[174]:
number of cyclist injured number of motorist injured number of pedestrians injured number of persons injured
road_type
avenue 4 258 76 338.0
others 2 126 28 156.0
street 2 91 41 134.0
parkway 0 109 2 111.0
expressway 0 85 4 89.0
N/A 0 39 2 41.0
road 0 29 7 36.0
drive 0 11 0 11.0
bridge 0 9 0 9.0
In [173]:
a = pivot_table(df= df_col, injured_killed='killed', index='road_type')
a.sort_values(by= 'number of persons killed', ascending = False)
Out[173]:
number of cyclist killed number of motorist killed number of pedestrians killed number of persons killed
road_type
avenue 8 107 59 174.0
others 2 53 25 80.0
street 3 34 40 77.0
parkway 0 50 10 60.0
expressway 0 32 5 37.0
N/A 0 16 4 20.0
road 0 10 4 14.0
bridge 0 6 0 6.0
drive 0 6 0 6.0
In [118]:
Plot_Injuries_vs_Year(df_col, col='borough', hue='borough', injured_killed='injured')
In [191]:
Plot_Injuries_vs_Year(df_col.loc[df_col['road_type'] == 'street'] , col='borough', hue='borough', injured_killed='injured')
In [190]:
Plot_Injuries_vs_Year(df_col.loc[df_col['road_type'] == 'others'] , col='borough', hue='borough', injured_killed='injured')
In [189]:
Plot_Injuries_vs_Year(df_col.loc[df_col['road_type'] == 'avenue'] , col='borough', hue='borough', injured_killed='injured')
In [119]:
test = df_col.loc[(df_col['number of persons killed'] > 0) & (df_col['number of persons injured'] > 0)]

Plot_Injuries_vs_Year(test, col='borough', hue='borough', injured_killed='injured')
In [184]:
Plot_Injuries_vs_Year(test, col='road_type', hue='road_type', injured_killed='injured')
In [185]:
Plot_Injuries_vs_Year(test, col='road_type', hue='road_type', injured_killed='killed')
In [195]:
Plot_Geoms_On_NYCMap(df_col.loc[df_col['road_type']=='expressway','location'].dropna(), ON_MAP=False)
Plot_Geoms_On_NYCMap(df_col.loc[df_col['road_type']=='parkway','location'].dropna(), ON_MAP=False)
Plot_Geoms_On_NYCMap(df_col.loc[df_col['road_type']=='bridge','location'].dropna(), ON_MAP=False)
Plot_Geoms_On_NYCMap(df_col.loc[df_col['road_type']=='drive','location'].dropna(), ON_MAP=False)

Plot_Geoms_On_NYCMap(df_col.loc[df_col['road_type'].apply(lambda x: x in ['expressway','parkway','bridge','drive']),'location'].dropna())
In [200]:
df_arterial = pd.read_csv("data/dot_VZV_Arterial_Slow_Zones_20190903.csv")
df_bike = pd.read_csv("data/dot_VZV_Bike_Priority_Districts_20190903.csv")
df_cross = pd.read_csv("data/dot_VZV_Enhanced_Crossings_20190903.csv")
df_left = pd.read_csv("data/dot_VZV_Left_Turn_Traffic_Calming_20190903.csv")
df_slow = pd.read_csv("data/dot_VZV_Neighborhood_Slow_Zones_20190903.csv")
df_signal = pd.read_csv("data/dot_VZV_Signal_Timing_20190903.csv")
df_spdhump = pd.read_csv("data/dot_VZV_Speed_Humps_20190903.csv")
df_spdlimit = pd.read_csv("data/dot_VZV_Speed_Limits_20190903.csv")

df_arterial['the_geom'] = df_arterial['the_geom'].apply(lambda x: shapely.wkt.loads(x))
df_bike['the_geom'] = df_bike['the_geom'].apply(lambda x: shapely.wkt.loads(x))
df_cross['the_geom'] = df_cross['the_geom'].apply(lambda x: shapely.wkt.loads(x))
df_left['the_geom'] = df_left['the_geom'].apply(lambda x: shapely.wkt.loads(x))
df_slow['the_geom'] = df_slow['the_geom'].apply(lambda x: shapely.wkt.loads(x))
df_signal['the_geom'] = df_signal['the_geom'].apply(lambda x: shapely.wkt.loads(x))
df_spdhump['the_geom'] = df_spdhump['the_geom'].apply(lambda x: shapely.wkt.loads(x))
df_spdlimit['the_geom'] = df_spdlimit['the_geom'].apply(lambda x: shapely.wkt.loads(x))

if False: 
    is_arterial, geoms_arterial = Is_Event_of_Geom(df_col, df_arterial, num_head = 100000000, THRESHOLD_DISTANCE = 0.0001)
    pickle.dump([is_arterial, geoms_arterial], open('data/is_arterial.pickle', 'wb'))
    # 10000 : 37 s
    # total : 1h 22min 9s
else:
    is_arterial, geoms_arterial = pickle.load(open('data/is_arterial.pickle', 'rb'))        

if False: 
    is_bike, geoms_bike = Is_Event_of_Geom(df_col, df_bike, num_head = 100000000, THRESHOLD_DISTANCE = 0.0001)
    pickle.dump([is_bike, geoms_bike], open('data/is_bike.pickle', 'wb'))
    # 10000 : 11.6 s
    # 100000 : 
else:
    is_bike, geoms_bike = pickle.load(open('data/is_bike.pickle', 'rb'))    

if False: 
    is_cross, geoms_cross = Is_Event_of_Geom(df_col, df_cross, num_head = 100000000, THRESHOLD_DISTANCE = 0.0002)
    pickle.dump([is_cross, geoms_cross], open('data/is_cross.pickle', 'wb'))
    # 10000 : 1.61 s
    # 100000 : 13.7 s
    # total : 3min 45s
else:
    is_cross, geoms_cross = pickle.load(open('data/is_cross.pickle', 'rb'))

if False: 
    is_left, geoms_left = Is_Event_of_Geom(df_col, df_left, num_head = 100000000, THRESHOLD_DISTANCE = 0.0002)
    pickle.dump([is_left, geoms_left], open('data/is_left.pickle', 'wb'))
    # 10000 : 2.63 s
    # 100000 :
    # total : 5min 37s
else:
    is_left, geoms_left = pickle.load(open('data/is_left.pickle', 'rb'))    

if False: 
    is_slow, geoms_slow = Is_Event_of_Geom(df_col, df_slow, num_head = 100000000, THRESHOLD_DISTANCE = 0.0001)
    pickle.dump([is_slow, geoms_slow], open('data/is_slow.pickle', 'wb'))
    # 10000 : 14.2 s
    # total : 25min 41s
else:
    is_slow, geoms_slow = pickle.load(open('data/is_slow.pickle', 'rb'))        

if False: 
    is_signal, geoms_signal = Is_Event_of_Geom(df_col, df_signal, num_head = 100000000, THRESHOLD_DISTANCE = 0.0001)
    pickle.dump([is_signal, geoms_signal], open('data/is_signal.pickle', 'wb'))
    # 10000 : 22.1 s
    # 100000 : 41min 28s
else:
    is_signal, geoms_signal = pickle.load(open('data/is_signal.pickle', 'rb'))            

if False: 
    is_spdhump, geoms_spdhump = Is_Event_of_Geom(df_col, df_spdhump, num_head = 100000000, THRESHOLD_DISTANCE = 0.0001)
    pickle.dump([is_spdhump, geoms_spdhump], open('data/is_spdhump.pickle', 'wb'))
    # 10000 : 14.6 s
    # total: 39min 43s
else:
    is_spdhump, geoms_spdhump = pickle.load(open('data/is_spdhump.pickle', 'rb'))                    
    
# if True: 
#     is_spdlimit, geoms_spdlimit = Is_Event_of_Geom(df_col, df_spdlimit, num_head = 100000000, THRESHOLD_DISTANCE = 0.0001)
#     pickle.dump([is_spdlimit, geoms_spdlimit], open('is_spdlimit.pickle', 'wb'))
#     # 100 : 7min 45s
#     # 1000 : 9min 46s
# else:
#     is_spdlimit, geoms_spdlimit = pickle.load(open('is_spdlimit.pickle', 'rb'))                    

df_col['is_arterial'] = is_arterial
df_col['is_bike'] = is_bike
df_col['is_cross'] = is_cross
df_col['is_left'] = is_left
df_col['is_slow'] = is_slow
df_col['is_signal'] = is_signal
df_col['is_spdhump'] = is_spdhump
# df_col['is_spdlimit'] = is_spdlimit
In [201]:
Plot_Geoms_On_NYCMap(df_arterial['the_geom'], ON_MAP=False)
Plot_Geoms_On_NYCMap(df_col.loc[df_col['is_arterial'],'location'].dropna(), ON_MAP=False)
Plot_Injuries_vs_Year(df_col, col='road_type', hue='is_arterial')

Plot_Geoms_On_NYCMap(df_bike['the_geom'], ON_MAP=False)
Plot_Geoms_On_NYCMap(df_col.loc[df_col['is_bike'],'location'].dropna(), ON_MAP=False)
Plot_Injuries_vs_Year(df_col, col='road_type', hue='is_bike')

Plot_Geoms_On_NYCMap(df_cross['the_geom'], ON_MAP=False)
Plot_Geoms_On_NYCMap(df_col.loc[df_col['is_cross'],'location'].dropna(), ON_MAP=False)
Plot_Injuries_vs_Year(df_col, col='road_type', hue='is_cross')

Plot_Geoms_On_NYCMap(df_left['the_geom'], ON_MAP=False)
Plot_Geoms_On_NYCMap(df_col.loc[df_col['is_left'],'location'].dropna(), ON_MAP=False)
Plot_Injuries_vs_Year(df_col, col='road_type', hue='is_left')

Plot_Geoms_On_NYCMap(df_slow['the_geom'], ON_MAP=False)
Plot_Geoms_On_NYCMap(df_col.loc[df_col['is_slow'],'location'].dropna(), ON_MAP=False)
Plot_Injuries_vs_Year(df_col, col='road_type', hue='is_slow')

Plot_Geoms_On_NYCMap(df_signal['the_geom'], ON_MAP=False)
Plot_Geoms_On_NYCMap(df_col.loc[df_col['is_signal'],'location'].dropna(), ON_MAP=False)
Plot_Injuries_vs_Year(df_col, col='road_type', hue='is_signal')

Plot_Geoms_On_NYCMap(df_spdhump['the_geom'], ON_MAP=False)
Plot_Geoms_On_NYCMap(df_col.loc[df_col['is_spdhump'],'location'].dropna(), ON_MAP=False)
Plot_Injuries_vs_Year(df_col, col='road_type', hue='is_spdhump')

# Plot_Geoms_On_NYCMap(df_spdlimit['the_geom'], ON_MAP=False)
# Plot_Injuries_vs_Year(df_col, col='road_type', hue='is_spdlimit')